library(aplore3)
library(ggplot2)
library(plotly)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)

Data Format: A data.frame with 500 rows and 18 variables such as:

priorfrac - If the patient previously had a fracture
age
weight
height
bmi
premeno
momfrac
armassist
smoke
raterisk
fracscore
fracture
bonemed - Bone medications at enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at enrollment and follow-up (1: No, 2: Yes)

head(glow_bonemed)
##   sub_id site_id phy_id priorfrac age weight height      bmi premeno momfrac
## 1      1       1     14        No  62   70.3    158 28.16055      No      No
## 2      2       4    284        No  65   87.1    160 34.02344      No      No
## 3      3       6    305       Yes  88   50.8    157 20.60936      No     Yes
## 4      4       6    309        No  82   62.1    160 24.25781      No      No
## 5      5       1     37        No  61   68.0    152 29.43213      No      No
## 6      6       5    299       Yes  67   68.0    161 26.23356      No      No
##   armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1        No    No     Same         1       No      No         No        No
## 2        No    No     Same         2       No      No         No        No
## 3       Yes    No     Less        11       No      No         No        No
## 4        No    No     Less         5       No      No         No        No
## 5        No    No     Same         1       No      No         No        No
## 6        No   Yes     Same         4       No      No         No        No

Summary statistics for numeric variables

mysummary = glow_bonemed %>%
  select(age, weight, height, bmi, fracscore) %>%
  summarise_each(
    funs(min = min, 
    q25 = quantile(., 0.25), 
    median = median, 
    q75 = quantile(., 0.75), 
    max = max,
    mean = mean, 
    sd = sd,
    variance= var))

# reshape it using tidyr functions

clean.summary = mysummary %>% 
  gather(stat, val) %>%
  separate(stat, into = c("var", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  select(var, min, max, mean, sd, variance)

print(clean.summary)
##         var       min       max      mean        sd   variance
## 1       age  55.00000  90.00000  68.56200  8.989537  80.811780
## 2       bmi  14.87637  49.08241  27.55303  5.973958  35.688178
## 3 fracscore   0.00000  11.00000   3.69800  2.495446   6.227251
## 4    height 134.00000 199.00000 161.36400  6.355493  40.392289
## 5    weight  39.90000 127.00000  71.82320 16.435992 270.141825

Summary statistics for categorical variables

summary(glow_bonemed %>% select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
##  priorfrac premeno   momfrac   armassist smoke        raterisk   fracture 
##  No :374   No :403   No :435   No :312   No :465   Less   :167   No :375  
##  Yes:126   Yes: 97   Yes: 65   Yes:188   Yes: 35   Same   :186   Yes:125  
##                                                    Greater:147            
##  bonemed   bonemed_fu bonetreat
##  No :371   No :361    No :382  
##  Yes:129   Yes:139    Yes:118  
## 

Age vs Weight: As weight increases the average age decreases
Age vs Height: Weak correlation of as height increases age decreases
Age vs BMI: As bmi increases the average age decreases
Age vs fracscore: As age increases the average fracscore increases

Weight vs Height: As height increases the average weight increases
Weight vs BMI: As bmi increases the average weight increases
Weight vs fracscore: As fracscore increases the average Weight decreases

Height vs BMI: As bmi increases the average height and variance stay the same
Height vs fracscore: As fracscore increases the average height stays the same though variance might decrease

BMI vs fracscore: As fracscore increases the average bmi decreases

plot(glow_bonemed[, c(5:8, 14)])

Non of the following scatter plots show strong groupings for the fracture/no fracture categorical variable

ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
  geom_jitter() +
  labs(title = "BMI vs Age")

ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
  geom_jitter() +
  labs(title = "Fracture Score vs BMI")

ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
  geom_jitter() +
  labs(title = "Age vs Fracture Score")

ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
  geom_jitter() +
  labs(title = "Height vs Weight")

Once again there doesn’t seem to be strong groupings of the fracture categorical variable

fracture3dplot = plot_ly(glow_bonemed, 
  x = ~age, 
  y = ~height, 
  z = ~bmi, 
  color = ~fracture, 
  colors = c('#0C4B8E', '#BF382A')) %>% add_markers()

fracture3dplot

There are so little “yes” fracture results that the plot isn’t very useful

## `geom_smooth()` using formula = 'y ~ x'

The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers

ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")

Plot confirms there is a strong correlation between age/fracscore, bmi/weight

corr_vars <- c("age", "weight", "height", "bmi", "fracscore")

corr_df <- glow_bonemed[, corr_vars]
corr_df <- cor(corr_df)

ggcorrplot(corr = corr_df, lab = TRUE, lab_size = 2,
  colors = c("#6D9EC1", "white", "#E46726")) +
  labs(title = "Correlation Between Variables") +
  theme(plot.title = element_text(hjust = .5),
  plot.subtitle = element_text(hjust = .5))

Clustering EDA

pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

zScoreScale = scale(glow_bonemed[, 5:8])

zScoreDistance = dist(zScoreScale)

continuousVariableClustering = hclust(zScoreDistance, method = "complete")

plot(continuousVariableClustering)

Modeling

Age is the only statistically significant continuous variable at the alpha = 0.2 level (p < 0.0001)

model = glm(fracture ~ age + weight + height + bmi, data = glow_bonemed, family = "binomial")

summary(model)
## 
## Call:
## glm(formula = fracture ~ age + weight + height + bmi, family = "binomial", 
##     data = glow_bonemed)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3745  -0.7689  -0.6275  -0.1437   2.1822  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -13.29208   12.54412  -1.060    0.289    
## age           0.05263    0.01237   4.255 2.09e-05 ***
## weight       -0.09720    0.08559  -1.136    0.256    
## height        0.04914    0.07747   0.634    0.526    
## bmi           0.27450    0.22072   1.244    0.214    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.34  on 499  degrees of freedom
## Residual deviance: 532.92  on 495  degrees of freedom
## AIC: 542.92
## 
## Number of Fisher Scoring iterations: 4
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result<-prcomp(glow_bonemed[, corr_vars],scale.=TRUE)

#Eigen Vectors
pc.result$rotation
##                  PC1         PC2         PC3         PC4          PC5
## age        0.4947219  0.46742140 -0.15246583  0.71654567 -0.009160237
## weight    -0.5273035  0.46578775 -0.08840991  0.03240244 -0.704362523
## height    -0.2345770 -0.08196149 -0.93823245  0.01885633  0.240042129
## bmi       -0.4741030  0.51615173  0.24533677  0.05137380  0.667820563
## fracscore  0.4442985  0.53984137 -0.16872342 -0.69463484  0.013601399
#Eigen Values
eigenvals<-pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342
#Scree plot
par(mfrow = c(1,2))

plot(eigenvals,type = "l",
     main = "Scree Plot",
     ylab = "Eigen Values",
     xlab = "PC #")

plot(eigenvals / sum(eigenvals), 
     type = "l", main = "Scree Plot", 
     ylab = "Prop. Var. Explained", 
     xlab = "PC #", 
     ylim = c(0, 1))

cumulative.prop = cumsum(eigenvals / sum(eigenvals))

lines(cumulative.prop, lty = 2)

#Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013) Applied Logistic Regression, 3rd ed., New York: Wiley

https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90